1) Given a file of FASTA sequences (you can use 'python.fasta' as an example), split it in (approximately) half, creating two files ('seq1.fasta' and 'seq2.fasta').

a) Use seq_utils.count_seqs() as part of the solution.

b) Do it without counting the sequences first.


In [10]:
import sys
sys.path.append('/home/pvh/Documents/counting_sequences/lib')
import seq_utils
import os

def split_fasta(filename):
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        with input_file:
            seq_count = seq_utils.count_seqs(input_file)
            input_file.seek(0)
            half = seq_count // 2
            count = 0
            filename = 'seq1.fasta'
            output_file = open(filename, 'w')
            for line in input_file:
                if line.lstrip().startswith('>'):
                    count += 1
                    if count == half+1:
                        # change over to seq2.fasta at halfway point
                        filename = 'seq2.fasta'
                        output_file.close()
                        output_file = open(filename, 'w')
                output_file.write(line)
        output_file.close()
        return True
    
print os.getcwd()
split_fasta('python.fasta')


/home/pvh/Documents/python
Out[10]:
True

In [13]:
import sys

def split_fasta2(filename):
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        filename1 = 'seq1.fasta'
        filename2 = 'seq2.fasta'
        output_file1 = open(filename1, 'w')
        output_file2 = open(filename2, 'w')
        filenum = 1
        with input_file:
            for line in input_file:
                if line.startswith('>'):
                    if filenum == 1:
                        filenum = 2
                        output_file = output_file2
                    elif filenum == 2:
                        filenum = 1
                        output_file = output_file1
                output_file.write(line)
        output_file1.close()
        output_file2.close()
        return True
split_fasta2('python.fasta')


Out[13]:
True

In [35]:
import sys

def split_fasta3(filename):
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        output_files = []
        num_parts = 2
        for partnum in range(1, num_parts+1):
            filename = 'seq{}.fasta'.format(partnum)
            output_file = open(filename, 'w')
            output_files.append(output_file)
        count = 0
        with input_file:
            for line in input_file:
                if line.startswith('>'):
                    count += 1
                    filenum = count % num_parts
                    output_file = output_files[filenum]
                output_file.write(line)
        for output_file in output_files:
            output_file.close()
        return True
split_fasta3('python.fasta')


Out[35]:
True

2) Expand the solution from 1) to split the file into any number of pieces - i.e. the input will now be a filename and the number of parts. You can experiment in iPython first but the final version should be a script split_fasta.py that takes two command line arguments.


In [26]:
import sys
sys.path.append('/home/pvh/Documents/python/lib')
import seq_utils
import os

def get_output_file(part_num):
    filename = 'seq{}.fasta'.format(part_num)
    output_file = open(filename, 'w')
    return output_file

# by default num_parts is equal to two
def split_fasta4(filename, num_parts=2):
    
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        with input_file:
            seq_count = seq_utils.count_seqs(input_file)
            if seq_count < num_parts:
                print >>sys.stderr, "Can't split, number of parts ({}) is greater than number of sequences in file ({})".format(num_parts, seq_count)
                return False
            input_file.seek(0)
            chunk_size = seq_count // num_parts
            count = 0
            part_num = 1
            output_file = get_output_file(part_num)
            for line in input_file:
                if line.lstrip().startswith('>'):
                    count += 1
                    if part_num < num_parts and count == (part_num*chunk_size)+1:
                        # change over to writing the next part
                        part_num += 1
                        output_file = get_output_file(part_num) 
                output_file.write(line)
        output_file.close()
        return True
    
print os.getcwd()
split_fasta('small.fasta', 6)


/home/pvh/Documents/python
Can't split, number of parts (6) is greater than number of sequences in file (5)
Out[26]:
False

In [29]:
import sys

def split_fasta5(filename, num_parts=2):
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        output_files = []
        for partnum in range(1, num_parts+1):
            filename = 'seq{}.fasta'.format(partnum)
            output_file = open(filename, 'w')
            output_files.append(output_file)
        count = 0
        with input_file:
            for line in input_file:
                if line.startswith('>'):
                    count += 1
                    filenum = (count - 1) % num_parts
                    output_file = output_files[filenum]
                output_file.write(line)
        for output_file in output_files:
            output_file.close()
        return True
split_fasta5('python.fasta', 3)


Out[29]:
True

In [34]:
import sys

def split_fasta6(filename, num_parts=2):
    try:
        input_file = open(filename)
    except IOError as e:
        print >>sys.stderr, "Failed to open {}: {}".format(filename, e.strerror)
        return False
    else:
        output_files = []
        count = 0
        with input_file:
            for line in input_file:
                if line.startswith('>'):
                    count += 1
                    filenum = (count - 1) % num_parts
                    if len(output_files)-1 < filenum:
                        filename = 'seq{}.fasta'.format(filenum+1)
                        output_file = open(filename, 'w')
                        output_files.append(output_file)
                    else:
                        output_file = output_files[filenum]
                output_file.write(line)
        for output_file in output_files:
            output_file.close()
        if count < num_parts:
            print >>sys.stderr, "Warning: number of parts ({}) is greater than number of sequences in file ({})".format(num_parts, count)
            return False        
        return True
split_fasta6('small.fasta', 6)


Warning: number of parts (6) is greater than number of sequences in file (5)
Out[34]:
False

In [ ]: